In [1]:
# Computations
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report, plot_confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, KFold

# sklearn
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn import metrics
from sklearn.feature_selection import RFE
from sklearn.utils.fixes import loguniform
from sklearn.model_selection import KFold

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.ensemble import VotingClassifier


# preprocessing
from sklearn.preprocessing import StandardScaler

# Visualisation libraries

## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex

## seaborn
import seaborn as sns
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
sns.set_style("white")

## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
plt.style.use('seaborn-whitegrid')
plt.rcParams['figure.figsize'] = 14, 8
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline

## plotly
from plotly.offline import init_notebook_mode, iplot 
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
# Graphics in retina format 
%config InlineBackend.figure_format = 'retina' 

import warnings
warnings.filterwarnings("ignore")

Anomaly Detection

Anomaly detection is a classification process in which rare items, events, or observations in data sets are identified. Learn more about this here. In this article, we investigate Credit Card Fraud Detection dataset from Kaggle.com.

Credit Card Fraud Detection

Context

Credit card companies must be able to recognize fraudulent credit card transactions so that customers are not charged for items that they did not purchase.

Content

The datasets contain transactions made by credit cards in September 2013 by European cardholders. This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions. It contains only numerical input variables which are the result of a PCA transformation. Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about the data. Features V1, V2, … V28 are the principal components obtained with PCA, the only features which have not been transformed with PCA are 'Time' and 'Amount'. Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 'Amount' is the transaction Amount, this feature can be used for example-dependant cost-sensitive learning. Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise.

In [2]:
Data = pd.read_csv('Data/creditcard.csv',sep=',')

Col = []
#         Temp = re.findall("(\d+)", s)
for s in Data.columns:
    if any(map(str.isdigit, s)) == True:
        Temp = s.split('V')
        Col.append('V'+ Temp[-1].zfill(2))
    else:
        Col.append(s)

Data.columns = Col
del Col

display(pd.DataFrame(Data.shape, columns = ['Count'], index = ['Attributes', 'Instances']).T)
Attributes Instances
Count 284807 31

Initial Analysis

From the above dataset, we can visualize the following features.

  • Amount
  • Class
  • Time

Transaction Class Distribution

In [3]:
Labels = ['Normal', 'Fraud']
Temp = Data['Class'].value_counts(sort = False).to_frame('Count').reset_index()
Temp.columns = ['Class','Count']
Temp['Class'] = Temp['Class'].map(lambda x: Labels[0] if x == 0 else Labels[1])
Temp['Percentage'] = np.round(100* Temp['Count'].values /Temp['Count'].sum(), 2)
display(Temp.style.hide_index().set_precision(2))

fig = px.bar(Temp, y= 'Class', x= 'Percentage', orientation='h', text = 'Count', color_discrete_sequence= ['Bisque'],
             height= 220)
fig.update_traces(marker_line_color= 'DarkRed', marker_line_width=1.5, opacity=1)
fig.update_traces(texttemplate='%{text:.2}', textposition='inside')
fig.update_layout(uniformtext_minsize= 8, uniformtext_mode='hide')
fig['layout']['xaxis'].update(range=[0, 100])
fig.update_layout(title = 'Transaction Class Distribution', plot_bgcolor= 'white')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
Class Count Percentage
Normal 284315 99.83
Fraud 492 0.17

As can be seen, nearly, 99.83 percent of the dataset are labeled as Normal.

Number of Transactions vs Amount of Transactions

In [4]:
fig, ax = plt.subplots(1, 1, figsize=(16, 6))
_ = ax.hist(Data.loc[Data.Class == 0, 'Amount'], 100, color = '#34495e', hatch = '/', lw = 1.5,
            edgecolor = '#3498db', label = Labels[0])
_ = ax.hist(Data.loc[Data.Class == 1, 'Amount'], 10, Color = '#e74c3c', hatch = '\\', lw = 1.5,
            edgecolor = 'DarkRed', label = Labels[1])
_ = ax.set_xlabel('Amount')
_ = ax.set_ylabel('Number of Transactions')
_ = ax.set_xlim([0, 2e4])
_ = ax.set_yscale('log')
_ = ax.set_ylim([0, 1e6])
_ = ax.legend(bbox_to_anchor=(1, 1), fontsize=14, ncol=2)

Time vs Amount of Transactions

In [5]:
fig, ax = plt.subplots(1, 1, figsize=(16, 6))
_ = ax.scatter(Data.loc[Data.Class == 0, 'Time'], Data.loc[Data.Class == 0, 'Amount'], s= 30,
               facecolors='SkyBlue', edgecolors='MidnightBlue', alpha = 0.8, label = Labels[0])
_ = ax.scatter(Data.loc[Data.Class == 1, 'Time'], Data.loc[Data.Class == 1, 'Amount'], s= 30,
               facecolors='Orange', edgecolors='DarkRed', alpha = 1, label = Labels[1])
_ = ax.set_xlabel('Time (in seconds)')
_ = ax.set_ylabel('Amount')
_ = ax.set_xlim([-500, Data.Time.max()+500])
_ = ax.set_ylim([-250, 2e4])
_ = ax.legend(bbox_to_anchor=(1, 1), fontsize=14, ncol=2)

Modeling

The Dataset is quite large, we would like to use pandas DataFrame sample feature with using a one-tenth of the data as a sample.

In [6]:
df= Data.sample(frac = 0.1, random_state=1)

def Data_info(Inp, Only_NaN = False):
    Out = Inp.dtypes.to_frame(name='Data Type').sort_values(by=['Data Type'])
    Out = Out.join(Inp.isnull().sum().to_frame(name = 'Number of NaN Values'), how='outer')
    Out['Size'] = Inp.shape[0]
    Out['Percentage'] = np.round(100*(Out['Number of NaN Values']/Inp.shape[0]),2)
    if Only_NaN:
        Out = Out.loc[Out['Number of NaN Values']>0]
    return Out
display(df.head())
Data_info(df)
Time V01 V02 V03 V04 V05 V06 V07 V08 V09 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
169876 119907.0 -0.611712 -0.769705 -0.149759 -0.224877 2.028577 -2.019887 0.292491 -0.523020 0.358468 ... -0.075208 0.045536 0.380739 0.023440 -2.220686 -0.201146 0.066501 0.221180 1.79 0
127467 78340.0 -0.814682 1.319219 1.329415 0.027273 -0.284871 -0.653985 0.321552 0.435975 -0.704298 ... -0.128619 -0.368565 0.090660 0.401147 -0.261034 0.080621 0.162427 0.059456 1.98 0
137900 82382.0 -0.318193 1.118618 0.969864 -0.127052 0.569563 -0.532484 0.706252 -0.064966 -0.463271 ... -0.305402 -0.774704 -0.123884 -0.495687 -0.018148 0.121679 0.249050 0.092516 0.89 0
21513 31717.0 -1.328271 1.018378 1.775426 -1.574193 -0.117696 -0.457733 0.681867 -0.031641 0.383872 ... -0.220815 -0.419013 -0.239197 0.009967 0.232829 0.814177 0.098797 -0.004273 15.98 0
134700 80923.0 1.276712 0.617120 -0.578014 0.879173 0.061706 -1.472002 0.373692 -0.287204 -0.084482 ... -0.160161 -0.430404 -0.076738 0.258708 0.552170 0.370701 -0.034255 0.041709 0.76 0

5 rows × 31 columns

Out[6]:
Data Type Number of NaN Values Size Percentage
Amount float64 0 28481 0.0
Class int64 0 28481 0.0
Time float64 0 28481 0.0
V01 float64 0 28481 0.0
V02 float64 0 28481 0.0
V03 float64 0 28481 0.0
V04 float64 0 28481 0.0
V05 float64 0 28481 0.0
V06 float64 0 28481 0.0
V07 float64 0 28481 0.0
V08 float64 0 28481 0.0
V09 float64 0 28481 0.0
V10 float64 0 28481 0.0
V11 float64 0 28481 0.0
V12 float64 0 28481 0.0
V13 float64 0 28481 0.0
V14 float64 0 28481 0.0
V15 float64 0 28481 0.0
V16 float64 0 28481 0.0
V17 float64 0 28481 0.0
V18 float64 0 28481 0.0
V19 float64 0 28481 0.0
V20 float64 0 28481 0.0
V21 float64 0 28481 0.0
V22 float64 0 28481 0.0
V23 float64 0 28481 0.0
V24 float64 0 28481 0.0
V25 float64 0 28481 0.0
V26 float64 0 28481 0.0
V27 float64 0 28481 0.0
V28 float64 0 28481 0.0

Data Correlations

First off, let's define $X$ and $y$ sets.

In [7]:
Target = 'Class'
X = df.drop(columns = [Target])
y = df[Target]

Now, let's take a look at the variance of the features.

In [8]:
display(X.var().sort_values(ascending = False).to_frame(name= 'Variance').T.style.set_precision(2))
Time Amount V01 V02 V03 V04 V05 V06 V07 V08 V09 V10 V11 V13 V12 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28
Variance 2264306246.92 73383.90 3.98 2.92 2.32 2.02 1.95 1.78 1.53 1.45 1.21 1.16 1.04 1.00 1.00 0.92 0.83 0.77 0.73 0.71 0.67 0.65 0.55 0.53 0.42 0.36 0.27 0.24 0.17 0.10

As can see some of the variables have high variance and this is not desirable for our modeling. Thus, we would like to standardize features by removing the mean and scaling to unit variance. In this article, we demonstrated the benefits of scaling data using StandardScaler().

In [9]:
Temp = X.columns
X = StandardScaler().fit_transform(X)
X = pd.DataFrame(X, columns = Temp)
del Temp

Correlations of features with Class.

In [10]:
Temp = pd.DataFrame(X, columns = df.drop(columns = [Target]).columns)
Temp[Target] = y

def Correlation_Plot (Df,Fig_Size):
    Correlation_Matrix = Df.corr().round(2)
    mask = np.zeros_like(Correlation_Matrix)
    mask[np.triu_indices_from(mask)] = True
    for i in range(len(mask)):
        mask[i,i]=0
    Fig, ax = plt.subplots(figsize=(Fig_Size,Fig_Size))
    sns.heatmap(Correlation_Matrix, ax=ax, mask=mask, annot=True, square=True, 
                cmap =sns.color_palette("Greens", n_colors=10), linewidths = 0.2, vmin=0, vmax=1, cbar_kws={"shrink": .6})

Correlation_Plot (Temp, 16)

Train and Test Sets

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
pd.DataFrame(data={'Set':['X_train','X_test','y_train','y_test'],
               'Shape':[X_train.shape, X_test.shape, y_train.shape, y_test.shape]}).set_index('Set').T
Out[11]:
Set X_train X_test y_train y_test
Shape (19936, 30) (8545, 30) (19936,) (8545,)

A number of functions that we will be using throughout this article.

In [12]:
def Performance(clf, X_test = X_test):
    df = pd.DataFrame()
    y_pred = clf.predict(X_test)
    df = df.append({'Score': clf.score(X_test, y_test),
                    'F1 Score': f1_score(y_test.values, y_pred, average= 'weighted'),
                    'Precision Score': precision_score(y_test.values, y_pred, average= 'weighted'),
                    'Recall Score':  recall_score(y_test.values, y_pred, average= 'weighted')}, ignore_index=True)
    display(df.style.hide_index())

def highlight_max(s):
    is_max = s == s.max()
    return ['background-color: SpringGreen' if v else '' for v in is_max]


def Feature_Ranking(clf):
    df = pd.DataFrame()
    for n in range(2, X.shape[1]):
        selector = RFE(estimator= clf, n_features_to_select=n, verbose=0)
        selector.fit(X_train, y_train)
        df = df.append({'Number of Features to Select': n,
                        'Score':metrics.accuracy_score(y_test, selector.predict(X_test)),
                        'Features': X.columns[selector.support_].tolist(),
                        'Best Features':X.columns[selector.ranking_ == 1].tolist()}, ignore_index=True)

    df = df[['Number of Features to Select', 'Score', 'Features', 'Best Features']]
    display(df.style.apply(highlight_max, subset=['Score']))
    return df.loc[df.Score == df.Score.max(), 'Features'].values[0]

def ROC_Curve(clf, X_test = X_test):
    # false positive rates, true positive rates and thresholds
    fpr, tpr, threshold = metrics.roc_curve(y_test, clf.predict_proba(X_test)[:,1])

    fig, ax = plt.subplots(1, 1, figsize=(5.5, 5.5))
    _ = ax.plot(fpr, tpr, lw=2, label = 'AUC = %0.2f' % metrics.auc(fpr, tpr))
    _ = ax.plot([0, 1], [0, 1],'r--', lw=2)
    _ = ax.legend(loc = 'lower right', fontsize = 14)
    delta = 0.01
    _ = ax.set_xlim([-delta,1+delta])
    _ = ax.set_ylim([-delta,1+delta])
    _ = ax.set_xlabel('False Positive Rate (FPR)')
    _ = ax.set_ylabel('True Positive Rate (TPR)')

Decision Tree Classifier

The first classifier that we use here is Decision Tree Classifier.

In [13]:
dtc = DecisionTreeClassifier()
_ = dtc.fit(X_train,y_train)
Performance(dtc)
ROC_Curve(dtc)
F1 Score Precision Score Recall Score Score
0.998171 0.998368 0.998011 0.998011

However, we could also use RFE from sklearn.feature_selection. This provides feature ranking with recursive feature elimination.

In [14]:
Best_Features = Feature_Ranking(dtc)
Number of Features to Select Score Features Best Features
0 2.000000 0.998479 ['V12', 'V17'] ['V12', 'V17']
1 3.000000 0.998713 ['V12', 'V17', 'V20'] ['V12', 'V17', 'V20']
2 4.000000 0.998830 ['V03', 'V12', 'V17', 'V20'] ['V03', 'V12', 'V17', 'V20']
3 5.000000 0.998596 ['V07', 'V12', 'V14', 'V17', 'V20'] ['V07', 'V12', 'V14', 'V17', 'V20']
4 6.000000 0.998830 ['V03', 'V07', 'V12', 'V14', 'V17', 'V20'] ['V03', 'V07', 'V12', 'V14', 'V17', 'V20']
5 7.000000 0.998479 ['V03', 'V07', 'V12', 'V14', 'V15', 'V17', 'V20'] ['V03', 'V07', 'V12', 'V14', 'V15', 'V17', 'V20']
6 8.000000 0.998245 ['V03', 'V06', 'V07', 'V12', 'V14', 'V15', 'V17', 'V20'] ['V03', 'V06', 'V07', 'V12', 'V14', 'V15', 'V17', 'V20']
7 9.000000 0.998479 ['V03', 'V06', 'V07', 'V12', 'V14', 'V17', 'V19', 'V20', 'V28'] ['V03', 'V06', 'V07', 'V12', 'V14', 'V17', 'V19', 'V20', 'V28']
8 10.000000 0.998245 ['V03', 'V06', 'V07', 'V12', 'V14', 'V15', 'V17', 'V19', 'V20', 'V28'] ['V03', 'V06', 'V07', 'V12', 'V14', 'V15', 'V17', 'V19', 'V20', 'V28']
9 11.000000 0.998245 ['V03', 'V06', 'V07', 'V12', 'V14', 'V15', 'V17', 'V19', 'V20', 'V28', 'Amount'] ['V03', 'V06', 'V07', 'V12', 'V14', 'V15', 'V17', 'V19', 'V20', 'V28', 'Amount']
10 12.000000 0.998128 ['V03', 'V06', 'V07', 'V08', 'V12', 'V14', 'V15', 'V17', 'V19', 'V20', 'V28', 'Amount'] ['V03', 'V06', 'V07', 'V08', 'V12', 'V14', 'V15', 'V17', 'V19', 'V20', 'V28', 'Amount']
11 13.000000 0.997894 ['V03', 'V06', 'V07', 'V08', 'V12', 'V14', 'V15', 'V17', 'V18', 'V19', 'V20', 'V28', 'Amount'] ['V03', 'V06', 'V07', 'V08', 'V12', 'V14', 'V15', 'V17', 'V18', 'V19', 'V20', 'V28', 'Amount']
12 14.000000 0.998011 ['V03', 'V06', 'V07', 'V08', 'V12', 'V14', 'V15', 'V17', 'V18', 'V19', 'V20', 'V21', 'V28', 'Amount'] ['V03', 'V06', 'V07', 'V08', 'V12', 'V14', 'V15', 'V17', 'V18', 'V19', 'V20', 'V21', 'V28', 'Amount']
13 15.000000 0.998011 ['V03', 'V06', 'V07', 'V08', 'V11', 'V12', 'V14', 'V15', 'V17', 'V18', 'V19', 'V20', 'V21', 'V28', 'Amount'] ['V03', 'V06', 'V07', 'V08', 'V11', 'V12', 'V14', 'V15', 'V17', 'V18', 'V19', 'V20', 'V21', 'V28', 'Amount']
14 16.000000 0.998011 ['V03', 'V06', 'V07', 'V08', 'V09', 'V11', 'V12', 'V13', 'V14', 'V15', 'V17', 'V18', 'V19', 'V20', 'V28', 'Amount'] ['V03', 'V06', 'V07', 'V08', 'V09', 'V11', 'V12', 'V13', 'V14', 'V15', 'V17', 'V18', 'V19', 'V20', 'V28', 'Amount']
15 17.000000 0.998011 ['V03', 'V06', 'V07', 'V08', 'V09', 'V10', 'V12', 'V13', 'V14', 'V15', 'V17', 'V18', 'V19', 'V20', 'V21', 'V28', 'Amount'] ['V03', 'V06', 'V07', 'V08', 'V09', 'V10', 'V12', 'V13', 'V14', 'V15', 'V17', 'V18', 'V19', 'V20', 'V21', 'V28', 'Amount']
16 18.000000 0.998011 ['V03', 'V06', 'V07', 'V08', 'V09', 'V10', 'V12', 'V13', 'V14', 'V15', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V28', 'Amount'] ['V03', 'V06', 'V07', 'V08', 'V09', 'V10', 'V12', 'V13', 'V14', 'V15', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V28', 'Amount']
17 19.000000 0.998128 ['V03', 'V06', 'V07', 'V08', 'V09', 'V12', 'V13', 'V14', 'V15', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V28', 'Amount'] ['V03', 'V06', 'V07', 'V08', 'V09', 'V12', 'V13', 'V14', 'V15', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V28', 'Amount']
18 20.000000 0.998011 ['V03', 'V06', 'V07', 'V08', 'V11', 'V12', 'V13', 'V14', 'V15', 'V17', 'V18', 'V19', 'V20', 'V22', 'V23', 'V24', 'V25', 'V26', 'V28', 'Amount'] ['V03', 'V06', 'V07', 'V08', 'V11', 'V12', 'V13', 'V14', 'V15', 'V17', 'V18', 'V19', 'V20', 'V22', 'V23', 'V24', 'V25', 'V26', 'V28', 'Amount']
19 21.000000 0.998011 ['V03', 'V06', 'V07', 'V08', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V17', 'V18', 'V19', 'V20', 'V22', 'V23', 'V24', 'V25', 'V26', 'V28', 'Amount'] ['V03', 'V06', 'V07', 'V08', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V17', 'V18', 'V19', 'V20', 'V22', 'V23', 'V24', 'V25', 'V26', 'V28', 'Amount']
20 22.000000 0.998011 ['V03', 'V05', 'V06', 'V07', 'V08', 'V11', 'V12', 'V13', 'V14', 'V15', 'V17', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'] ['V03', 'V05', 'V06', 'V07', 'V08', 'V11', 'V12', 'V13', 'V14', 'V15', 'V17', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']
21 23.000000 0.998128 ['V03', 'V05', 'V06', 'V07', 'V08', 'V10', 'V12', 'V13', 'V14', 'V15', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'] ['V03', 'V05', 'V06', 'V07', 'V08', 'V10', 'V12', 'V13', 'V14', 'V15', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']
22 24.000000 0.998011 ['V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V11', 'V12', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'] ['V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V11', 'V12', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']
23 25.000000 0.998128 ['V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V10', 'V12', 'V13', 'V14', 'V15', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'] ['V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V10', 'V12', 'V13', 'V14', 'V15', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']
24 26.000000 0.998011 ['V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'] ['V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']
25 27.000000 0.998011 ['V01', 'V02', 'V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'] ['V01', 'V02', 'V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']
26 28.000000 0.998011 ['V02', 'V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'] ['V02', 'V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']
27 29.000000 0.998011 ['V01', 'V02', 'V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'] ['V01', 'V02', 'V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']

Thus, we could use only fewer features and improve the results of classifications. The best features for the classifications are

In [15]:
print(Back.BLACK + Fore.CYAN + Style.NORMAL + 'Best Features:'+ Style.RESET_ALL + ' %s:' % ', '.join(Best_Features))
Best Features: V03, V12, V17, V20:
In [16]:
dtc = DecisionTreeClassifier()
_ = dtc.fit(X_train[Best_Features],y_train)
Performance(dtc, X_test[Best_Features])
ROC_Curve(dtc, X_test[Best_Features])
F1 Score Precision Score Recall Score Score
0.998830 0.998830 0.998830 0.998830
In [17]:
# Train set
y_pred = dtc.predict(X_train[Best_Features])
Confusion_Matrix = confusion_matrix(y_train, y_pred)

fig, ax = plt.subplots(1, 2, figsize=(15, 5))
fig.suptitle('Train Set', fontsize = 18)
_ = sns.heatmap(Confusion_Matrix, annot=True, annot_kws={"size": 14}, cmap="Blues", ax = ax[0],
               linewidths = 0.2, vmin=0, vmax=2e4, cbar_kws={"shrink": 1})
_ = ax[0].set_xlabel('Predicted labels')
_ = ax[0].set_ylabel('True labels'); 
_ = ax[0].set_title('Confusion Matrix');
_ = ax[0].xaxis.set_ticklabels(Labels)
_ = ax[0].yaxis.set_ticklabels(Labels)

Confusion_Matrix = Confusion_Matrix.astype('float') / Confusion_Matrix.sum(axis=1)[:, np.newaxis]
_ = sns.heatmap(Confusion_Matrix, annot=True, annot_kws={"size": 14}, cmap="Greens", ax = ax[1],
               linewidths = 0.2, vmin=0, vmax=1, cbar_kws={"shrink": 1})
_ = ax[1].set_xlabel('Predicted labels')
_ = ax[1].set_ylabel('True labels'); 
_ = ax[1].set_title('Normalized Confusion Matrix');
_ = ax[1].xaxis.set_ticklabels(Labels)
_ = ax[1].yaxis.set_ticklabels(Labels)

# Test set
y_pred = dtc.predict(X_test[Best_Features])
Confusion_Matrix = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(1, 2, figsize=(15, 5))
fig.suptitle('Test Set', fontsize = 18)
_ = sns.heatmap(Confusion_Matrix, annot=True, annot_kws={"size": 14}, cmap="Blues", ax = ax[0],
                linewidths = 0.2, vmin=0, vmax=9e3, cbar_kws={"shrink": 1})
_ = ax[0].set_xlabel('Predicted labels')
_ = ax[0].set_ylabel('True labels'); 
_ = ax[0].set_title('Confusion Matrix');
_ = ax[0].xaxis.set_ticklabels(Labels)
_ = ax[0].yaxis.set_ticklabels(Labels)

Confusion_Matrix = Confusion_Matrix.astype('float') / Confusion_Matrix.sum(axis=1)[:, np.newaxis]
_ = sns.heatmap(Confusion_Matrix, annot=True, annot_kws={"size": 14}, cmap="Greens", ax = ax[1],
               linewidths = 0.2, vmin=0, vmax=1, cbar_kws={"shrink": 1})
_ = ax[1].set_xlabel('Predicted labels')
_ = ax[1].set_ylabel('True labels'); 
_ = ax[1].set_title('Normalized Confusion Matrix');
_ = ax[1].xaxis.set_ticklabels(Labels)
_ = ax[1].yaxis.set_ticklabels(Labels)

RFE can be very useful, especially for cases that the number of features is quite large.

Random Forest Classifier

In [18]:
rfc = RandomForestClassifier()
_ = rfc.fit(X_train,y_train)
Performance(rfc)
ROC_Curve(rfc)
F1 Score Precision Score Recall Score Score
0.999377 0.999380 0.999415 0.999415
In [19]:
Best_Features = Feature_Ranking(rfc)
Number of Features to Select Score Features Best Features
0 2.000000 0.999064 ['V12', 'V17'] ['V12', 'V17']
1 3.000000 0.999181 ['V12', 'V14', 'V17'] ['V12', 'V14', 'V17']
2 4.000000 0.998947 ['V11', 'V12', 'V14', 'V17'] ['V11', 'V12', 'V14', 'V17']
3 5.000000 0.998947 ['V07', 'V11', 'V12', 'V14', 'V17'] ['V07', 'V11', 'V12', 'V14', 'V17']
4 6.000000 0.999064 ['V07', 'V10', 'V11', 'V12', 'V14', 'V17'] ['V07', 'V10', 'V11', 'V12', 'V14', 'V17']
5 7.000000 0.999064 ['V07', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17'] ['V07', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17']
6 8.000000 0.999064 ['V07', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V28'] ['V07', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V28']
7 9.000000 0.999298 ['V07', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V26', 'V28'] ['V07', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V26', 'V28']
8 10.000000 0.999298 ['V01', 'V07', 'V08', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V20'] ['V01', 'V07', 'V08', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V20']
9 11.000000 0.999298 ['V01', 'V07', 'V08', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V20', 'V28'] ['V01', 'V07', 'V08', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V20', 'V28']
10 12.000000 0.999181 ['V01', 'V03', 'V07', 'V08', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V20', 'V28'] ['V01', 'V03', 'V07', 'V08', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V20', 'V28']
11 13.000000 0.999298 ['V04', 'V06', 'V07', 'V09', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V18', 'V20', 'V28'] ['V04', 'V06', 'V07', 'V09', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V18', 'V20', 'V28']
12 14.000000 0.999415 ['V01', 'V03', 'V06', 'V07', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V19', 'V21', 'V26', 'V28'] ['V01', 'V03', 'V06', 'V07', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V19', 'V21', 'V26', 'V28']
13 15.000000 0.999298 ['V01', 'V03', 'V06', 'V07', 'V08', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V18', 'V19', 'V20', 'V28'] ['V01', 'V03', 'V06', 'V07', 'V08', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V18', 'V19', 'V20', 'V28']
14 16.000000 0.999298 ['V01', 'V03', 'V04', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V19', 'V20', 'V28'] ['V01', 'V03', 'V04', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V19', 'V20', 'V28']
15 17.000000 0.999415 ['V01', 'V03', 'V04', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V18', 'V19', 'V20', 'V26', 'V28'] ['V01', 'V03', 'V04', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V18', 'V19', 'V20', 'V26', 'V28']
16 18.000000 0.999298 ['V01', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V28'] ['V01', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V28']
17 19.000000 0.999415 ['V01', 'V03', 'V04', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V26', 'V28'] ['V01', 'V03', 'V04', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V14', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V26', 'V28']
18 20.000000 0.999298 ['V01', 'V03', 'V04', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V13', 'V14', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V26', 'V28'] ['V01', 'V03', 'V04', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V13', 'V14', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V26', 'V28']
19 21.000000 0.999181 ['V01', 'V02', 'V03', 'V04', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V28'] ['V01', 'V02', 'V03', 'V04', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V28']
20 22.000000 0.999298 ['Time', 'V02', 'V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V13', 'V14', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V26', 'V28'] ['Time', 'V02', 'V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V13', 'V14', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V26', 'V28']
21 23.000000 0.999415 ['V01', 'V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V26', 'V27', 'V28'] ['V01', 'V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V26', 'V27', 'V28']
22 24.000000 0.999298 ['V01', 'V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V26', 'V27', 'V28', 'Amount'] ['V01', 'V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V26', 'V27', 'V28', 'Amount']
23 25.000000 0.999415 ['Time', 'V01', 'V02', 'V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V26', 'V28'] ['Time', 'V01', 'V02', 'V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V26', 'V28']
24 26.000000 0.999415 ['V01', 'V02', 'V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V23', 'V25', 'V26', 'V27', 'V28'] ['V01', 'V02', 'V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V23', 'V25', 'V26', 'V27', 'V28']
25 27.000000 0.999298 ['Time', 'V01', 'V02', 'V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V26', 'V28', 'Amount'] ['Time', 'V01', 'V02', 'V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V26', 'V28', 'Amount']
26 28.000000 0.999298 ['Time', 'V01', 'V02', 'V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V23', 'V24', 'V25', 'V26', 'V28', 'Amount'] ['Time', 'V01', 'V02', 'V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V23', 'V24', 'V25', 'V26', 'V28', 'Amount']
27 29.000000 0.999298 ['Time', 'V01', 'V02', 'V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'] ['Time', 'V01', 'V02', 'V03', 'V04', 'V05', 'V06', 'V07', 'V08', 'V09', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']

Thus, we could use only fewer features and improve the results of classifications. The best features for the classifications are

In [20]:
print(Back.BLACK + Fore.CYAN + Style.NORMAL + 'Best Features:'+ Style.RESET_ALL + ' %s:' % ', '.join(Best_Features))
Best Features: V01, V03, V06, V07, V10, V11, V12, V14, V16, V17, V19, V21, V26, V28:
In [21]:
rfc = RandomForestClassifier()
_ = rfc.fit(X_train[Best_Features],y_train)
Performance(rfc, X_test[Best_Features])
ROC_Curve(rfc, X_test[Best_Features])
F1 Score Precision Score Recall Score Score
0.999127 0.999111 0.999181 0.999181
In [22]:
# Train set
y_pred = rfc.predict(X_train[Best_Features])
Confusion_Matrix = confusion_matrix(y_train, y_pred)

fig, ax = plt.subplots(1, 2, figsize=(15, 5))
fig.suptitle('Train Set', fontsize = 18)
_ = sns.heatmap(Confusion_Matrix, annot=True, annot_kws={"size": 14}, cmap="Blues", ax = ax[0],
               linewidths = 0.2, vmin=0, vmax=2e4, cbar_kws={"shrink": 1})
_ = ax[0].set_xlabel('Predicted labels')
_ = ax[0].set_ylabel('True labels'); 
_ = ax[0].set_title('Confusion Matrix');
_ = ax[0].xaxis.set_ticklabels(Labels)
_ = ax[0].yaxis.set_ticklabels(Labels)

Confusion_Matrix = Confusion_Matrix.astype('float') / Confusion_Matrix.sum(axis=1)[:, np.newaxis]
_ = sns.heatmap(Confusion_Matrix, annot=True, annot_kws={"size": 14}, cmap="Greens", ax = ax[1],
               linewidths = 0.2, vmin=0, vmax=1, cbar_kws={"shrink": 1})
_ = ax[1].set_xlabel('Predicted labels')
_ = ax[1].set_ylabel('True labels'); 
_ = ax[1].set_title('Normalized Confusion Matrix');
_ = ax[1].xaxis.set_ticklabels(Labels)
_ = ax[1].yaxis.set_ticklabels(Labels)

# Test set
y_pred = rfc.predict(X_test[Best_Features])
Confusion_Matrix = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(1, 2, figsize=(15, 5))
fig.suptitle('Test Set', fontsize = 18)
_ = sns.heatmap(Confusion_Matrix, annot=True, annot_kws={"size": 14}, cmap="Blues", ax = ax[0],
                linewidths = 0.2, vmin=0, vmax=9e3, cbar_kws={"shrink": 1})
_ = ax[0].set_xlabel('Predicted labels')
_ = ax[0].set_ylabel('True labels'); 
_ = ax[0].set_title('Confusion Matrix');
_ = ax[0].xaxis.set_ticklabels(Labels)
_ = ax[0].yaxis.set_ticklabels(Labels)

Confusion_Matrix = Confusion_Matrix.astype('float') / Confusion_Matrix.sum(axis=1)[:, np.newaxis]
_ = sns.heatmap(Confusion_Matrix, annot=True, annot_kws={"size": 14}, cmap="Greens", ax = ax[1],
               linewidths = 0.2, vmin=0, vmax=1, cbar_kws={"shrink": 1})
_ = ax[1].set_xlabel('Predicted labels')
_ = ax[1].set_ylabel('True labels'); 
_ = ax[1].set_title('Normalized Confusion Matrix');
_ = ax[1].xaxis.set_ticklabels(Labels)
_ = ax[1].yaxis.set_ticklabels(Labels)

It can be seen overall, Random Forest Classifier performed better in this example. Furthermore, using RFE improves the accuracy of this classification.